knitr::opts_chunk$set(echo = TRUE)

#Setting Working Directory
knitr::opts_knit$set(root.dir = "~/Desktop/Final Project/")
getwd()
## [1] "/Users/christian3/Desktop/Final Project"

#Install and Intialize Packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.1
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggthemes)
library(gifski)
library(gganimate)
library(png)
library(extrafont)
## Registering fonts with R
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(RColorBrewer)
library(tidyquant)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(vars)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: strucchange
## Loading required package: sandwich
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## Loading required package: urca
## Loading required package: lmtest
## 
## Attaching package: 'vars'
## 
## The following object is masked from 'package:tidyquant':
## 
##     VAR
library(sarima)
## Loading required package: stats4
## 
## Attaching package: 'sarima'
## 
## The following object is masked from 'package:stats':
## 
##     spectrum
library(lmtest)

#Read CSV

setwd("~/Desktop/Final Project/")
NYC <-read.csv("NYC_Property_Sales_Data.csv")
NYC

#Summary

summary(NYC)
##        X            BOROUGH      NEIGHBORHOOD       BUILDING.CLASS.CATEGORY
##  Min.   :    4   Min.   :1.000   Length:84548       Length:84548           
##  1st Qu.: 4231   1st Qu.:2.000   Class :character   Class :character       
##  Median : 8942   Median :3.000   Mode  :character   Mode  :character       
##  Mean   :10344   Mean   :2.999                                             
##  3rd Qu.:15987   3rd Qu.:4.000                                             
##  Max.   :26739   Max.   :5.000                                             
##  TAX.CLASS.AT.PRESENT     BLOCK            LOT         EASE.MENT     
##  Length:84548         Min.   :    1   Min.   :   1.0   Mode:logical  
##  Class :character     1st Qu.: 1323   1st Qu.:  22.0   NA's:84548    
##  Mode  :character     Median : 3311   Median :  50.0                 
##                       Mean   : 4237   Mean   : 376.2                 
##                       3rd Qu.: 6281   3rd Qu.:1001.0                 
##                       Max.   :16322   Max.   :9106.0                 
##  BUILDING.CLASS.AT.PRESENT   ADDRESS          APARTMENT.NUMBER  
##  Length:84548              Length:84548       Length:84548      
##  Class :character          Class :character   Class :character  
##  Mode  :character          Mode  :character   Mode  :character  
##                                                                 
##                                                                 
##                                                                 
##     ZIP.CODE     RESIDENTIAL.UNITS  COMMERCIAL.UNITS     TOTAL.UNITS      
##  Min.   :    0   Min.   :   0.000   Min.   :   0.0000   Min.   :   0.000  
##  1st Qu.:10305   1st Qu.:   0.000   1st Qu.:   0.0000   1st Qu.:   1.000  
##  Median :11209   Median :   1.000   Median :   0.0000   Median :   1.000  
##  Mean   :10732   Mean   :   2.025   Mean   :   0.1936   Mean   :   2.249  
##  3rd Qu.:11357   3rd Qu.:   2.000   3rd Qu.:   0.0000   3rd Qu.:   2.000  
##  Max.   :11694   Max.   :1844.000   Max.   :2261.0000   Max.   :2261.000  
##  LAND.SQUARE.FEET   GROSS.SQUARE.FEET    YEAR.BUILT   TAX.CLASS.AT.TIME.OF.SALE
##  Length:84548       Length:84548       Min.   :   0   Min.   :1.000            
##  Class :character   Class :character   1st Qu.:1920   1st Qu.:1.000            
##  Mode  :character   Mode  :character   Median :1940   Median :2.000            
##                                        Mean   :1789   Mean   :1.657            
##                                        3rd Qu.:1965   3rd Qu.:2.000            
##                                        Max.   :2017   Max.   :4.000            
##  BUILDING.CLASS.AT.TIME.OF.SALE  SALE.PRICE         SALE.DATE        
##  Length:84548                   Length:84548       Length:84548      
##  Class :character               Class :character   Class :character  
##  Mode  :character               Mode  :character   Mode  :character  
##                                                                      
##                                                                      
## 

#Cleaning

NYC <- NYC %>% 
  mutate(SALE.PRICE=as.numeric(SALE.PRICE))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `SALE.PRICE = as.numeric(SALE.PRICE)`.
## Caused by warning:
## ! NAs introduced by coercion
NYC <- NYC%>%
    mutate(SALE.DATE=as.Date(SALE.DATE),'%m/%d/%Y')
NYC<- NYC %>%
  dplyr::select("BOROUGH","SALE.PRICE","SALE.DATE")
NYC<- NYC %>% 
  drop_na()
filter(NYC, SALE.PRICE > 0)
NYC$BOROUGH = case_when(
  NYC$BOROUGH  == "1" ~ "Manhatten",
  NYC$BOROUGH  == "2" ~ "Bronx",
  NYC$BOROUGH  == "3" ~ "Brooklyn",
  NYC$BOROUGH  == "4" ~ "Queens",
  NYC$BOROUGH  == "5" ~ "Staten Island",
)
NYC <- NYC %>% 
    group_by(month = lubridate::floor_date(SALE.DATE, 'month'), BOROUGH) %>%
    summarize(
      average_price = mean(SALE.PRICE))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
NYC <- NYC %>% 
       rename("borough" = "BOROUGH")
NYC
summary(NYC)
##      month              borough          average_price    
##  Min.   :2016-09-01   Length:60          Min.   : 423580  
##  1st Qu.:2016-11-23   Class :character   1st Qu.: 565888  
##  Median :2017-02-15   Mode  :character   Median : 721588  
##  Mean   :2017-02-14                      Mean   :1203620  
##  3rd Qu.:2017-05-08                      3rd Qu.: 919570  
##  Max.   :2017-08-01                      Max.   :4709871

#Simple Plot

ggplot(NYC, aes(x=month, y=average_price, color=borough))+
  scale_y_continuous( breaks = seq(from = 0,to = 5000000, by = 500000), labels =label_dollar()) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  scale_x_date(date_labels="%b %Y",date_breaks  = "1 month")+
  geom_line()+
  labs(title="NYC Average Property Sale Price")

#FacetWrap For Each Borough

ggplot(NYC, aes(x=month, y=average_price, color=borough))+
  scale_y_continuous( breaks = seq(from = 0,to = 5000000, by = 500000), labels = label_dollar()) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  scale_x_date(date_labels="%b %Y",date_breaks  = "1 month")+
  geom_line()+
  labs(title="NYC Average Property Sale Price")+
  facet_wrap(~borough)

#Plotting Graph + Animation

video = NYC %>% 
  ggplot(aes(x=month, y=average_price, color=borough)) +
  geom_line(linewidth=2, alpha=0.75) + #thick lines
  theme_solarized_2(light=FALSE) + # Change style of lighting
  labs(title= "NYC Real Estate Sales 2016-2017",
       y="Sale Price ($ USD)") +
  theme(text=element_text(family="Courier", colour="#EEEEEE"),
        title=element_text(color= "#EEEEEE"),
        axis.title.x = element_blank(), # getting rid of x axis label
        panel.background=element_rect(fill=NA), # getting rid of grid lines
        plot.background=element_rect(fill="#111111"), # replace grid lines with darker tones
        panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(), #eliminating grid lines again
        legend.background=element_blank(), # getting rid of legend background
        legend.key=element_blank(), # getting rid of legend key
        legend.position = "bottom", # moving legend to bottom
        plot.title= element_text(hjust=0.5),
        axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  scale_colour_brewer(palette = "BuGn") +
  geom_point()+  # set points on lines
  scale_x_date(date_labels="%b '%y",date_breaks  = "1 month")+
  scale_y_continuous( breaks = seq(from = 0,to = 5000000, by = 500000), labels =label_dollar()) 

video

#Animate

video.animation = video +
  transition_reveal(month) +
  view_follow(fixed_y=TRUE) #set y axis as fixed by move x axis
animate(video.animation,height=500, width=800,fps=30,duration=10,end=60,res=100)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

#TimeSeries - Cleaning Data

setwd("~/Desktop/Final Project/")
NYC <-read.csv("NYC_Property_Sales_Data.csv")
NYC <- NYC %>% 
  mutate(SALE.PRICE=as.numeric(SALE.PRICE))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `SALE.PRICE = as.numeric(SALE.PRICE)`.
## Caused by warning:
## ! NAs introduced by coercion
NYC <- NYC%>%
    mutate(SALE.DATE=as.Date(SALE.DATE),'%m/%d/%Y')
NYC<- NYC %>%
  dplyr::select("BOROUGH","SALE.PRICE","SALE.DATE")
NYC<- NYC %>% 
  drop_na()
filter(NYC, SALE.PRICE > 0)
NYC <- NYC %>% 
    group_by(month = lubridate::floor_date(SALE.DATE, 'month'), BOROUGH) %>%
    summarize(
      average_price = mean(SALE.PRICE))
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
NYC = subset(NYC, NYC$BOROUGH < 2)
summary(NYC)
##      month               BOROUGH  average_price    
##  Min.   :2016-09-01   Min.   :1   Min.   :2561120  
##  1st Qu.:2016-11-23   1st Qu.:1   1st Qu.:2771101  
##  Median :2017-02-15   Median :1   Median :3017874  
##  Mean   :2017-02-14   Mean   :1   Mean   :3320867  
##  3rd Qu.:2017-05-08   3rd Qu.:1   3rd Qu.:3764480  
##  Max.   :2017-08-01   Max.   :1   Max.   :4709871

#Creating 2 Different Data Frames

train <- NYC%>% filter(month < "2017-03-01")
hold_out <- NYC%>% filter(month >= "2017-03-01")

#Graphing Data Frame Before Prediction

ggplot(train) +
  geom_line(aes(month, average_price)) +
  labs(title = "Average Real Estate Price in Manhatten")

#ARIMA Modeling

arima_model <- arima(train$average_price, c(2, 1, 0), method = "ML")
arima_model
## 
## Call:
## arima(x = train$average_price, order = c(2, 1, 0), method = "ML")
## 
## Coefficients:
##          ar1      ar2
##       0.4948  -0.8577
## s.e.  0.3150   0.1609
## 
## sigma^2 estimated as 1.83e+11:  log likelihood = -73.29,  aic = 152.59
coeftest(arima_model)
## 
## z test of coefficients:
## 
##     Estimate Std. Error z value  Pr(>|z|)    
## ar1  0.49475    0.31502  1.5705    0.1163    
## ar2 -0.85768    0.16090 -5.3305 9.793e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#AR Performance

arima_prediction <- predict(arima_model, n.ahead = 12)
arima_prediction
## $pred
## Time Series:
## Start = 7 
## End = 18 
## Frequency = 1 
##  [1] 2310117 3494560 4295849 3676415 2682695 2722326 3594230 3991616 3440409
## [10] 2826866 2996073 3606015
## 
## $se
## Time Series:
## Start = 7 
## End = 18 
## Frequency = 1 
##  [1]  427757.9  769285.5  856793.4  859331.1  870163.4  974574.8 1108175.0
##  [8] 1151346.3 1158646.1 1180124.8 1255092.1 1332289.3

#Plotting our ARIMA Prediction

ggplot(cbind(hold_out[1:12,c("month", "average_price")], as.data.frame(arima_prediction)), aes(x = month)) +
  geom_ribbon(aes(ymin = pred - se, ymax = pred + se), alpha = 0.25, fill = scales::muted("green")) +
  geom_line(aes(y = pred), lty = 2) +
  geom_line(aes(y = average_price)) +
  scale_y_continuous(breaks = seq(from = 0,to = 5000000, by = 500000), labels =label_dollar()) +
  labs(title = "ARIMA prediction of Average Real Estate Price in Manhatten", subtitle = "Actual = solid, prediciton = dashed, se = green")
## Warning: Removed 6 rows containing missing values (`geom_line()`).
## Removed 6 rows containing missing values (`geom_line()`).